Question 1:
library(data.table)
# Load the 2004 daily PM2.5 file; fread() returns a data.table
data_2004 <- fread("data/data_2004.csv")
# Working copy of the 2004 table (all data.table() options left at defaults)
pm_2004 <- data.table(data_2004)
# Load the 2019 daily PM2.5 file the same way
data_2019 <- fread("data/data_2019.csv")
# Working copy of the 2019 table (all data.table() options left at defaults)
pm_2019 <- data.table(data_2019)
# ---- Inspect the 2004 data ----
# Dimensions: number of rows and number of columns
dim(pm_2004)
## [1] 19233 20
# Preview the first six rows to confirm the file was read correctly
head(pm_2004)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 01/01/2004 AQS 60010007 1 8.9 ug/m3 LC
## 2: 01/02/2004 AQS 60010007 1 12.2 ug/m3 LC
## 3: 01/03/2004 AQS 60010007 1 16.5 ug/m3 LC
## 4: 01/04/2004 AQS 60010007 1 19.5 ug/m3 LC
## 5: 01/05/2004 AQS 60010007 1 11.5 ug/m3 LC
## 6: 01/06/2004 AQS 60010007 1 32.5 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 37 Livermore 1 100
## 2: 51 Livermore 1 100
## 3: 60 Livermore 1 100
## 4: 67 Livermore 1 100
## 5: 48 Livermore 1 100
## 6: 94 Livermore 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 41860
## 2: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 3: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 4: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 5: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## 6: 88502 Acceptable PM2.5 AQI & Speciation Mass 41860
## CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
## 1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## SITE_LATITUDE SITE_LONGITUDE
## 1: 37.68753 -121.7842
## 2: 37.68753 -121.7842
## 3: 37.68753 -121.7842
## 4: 37.68753 -121.7842
## 5: 37.68753 -121.7842
## 6: 37.68753 -121.7842
# Preview the last six rows to confirm the end of the file was read correctly
tail(pm_2004)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 12/14/2004 AQS 61131003 1 11 ug/m3 LC
## 2: 12/17/2004 AQS 61131003 1 16 ug/m3 LC
## 3: 12/20/2004 AQS 61131003 1 17 ug/m3 LC
## 4: 12/23/2004 AQS 61131003 1 9 ug/m3 LC
## 5: 12/26/2004 AQS 61131003 1 24 ug/m3 LC
## 6: 12/29/2004 AQS 61131003 1 9 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 46 Woodland-Gibson Road 1 100
## 2: 59 Woodland-Gibson Road 1 100
## 3: 61 Woodland-Gibson Road 1 100
## 4: 38 Woodland-Gibson Road 1 100
## 5: 76 Woodland-Gibson Road 1 100
## 6: 38 Woodland-Gibson Road 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 40900
## 2: 88101 PM2.5 - Local Conditions 40900
## 3: 88101 PM2.5 - Local Conditions 40900
## 4: 88101 PM2.5 - Local Conditions 40900
## 5: 88101 PM2.5 - Local Conditions 40900
## 6: 88101 PM2.5 - Local Conditions 40900
## CBSA_NAME STATE_CODE STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1: Yolo 38.66121 -121.7327
## 2: Yolo 38.66121 -121.7327
## 3: Yolo 38.66121 -121.7327
## 4: Yolo 38.66121 -121.7327
## 5: Yolo 38.66121 -121.7327
## 6: Yolo 38.66121 -121.7327
# List the variable (column) names
colnames(pm_2004)
## [1] "Date" "Source"
## [3] "Site ID" "POC"
## [5] "Daily Mean PM2.5 Concentration" "UNITS"
## [7] "DAILY_AQI_VALUE" "Site Name"
## [9] "DAILY_OBS_COUNT" "PERCENT_COMPLETE"
## [11] "AQS_PARAMETER_CODE" "AQS_PARAMETER_DESC"
## [13] "CBSA_CODE" "CBSA_NAME"
## [15] "STATE_CODE" "STATE"
## [17] "COUNTY_CODE" "COUNTY"
## [19] "SITE_LATITUDE" "SITE_LONGITUDE"
# Show the storage type of every variable along with its first few values
str(pm_2004)
## Classes 'data.table' and 'data.frame': 19233 obs. of 20 variables:
## $ Date : chr "01/01/2004" "01/02/2004" "01/03/2004" "01/04/2004" ...
## $ Source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ POC : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Daily Mean PM2.5 Concentration: num 8.9 12.2 16.5 19.5 11.5 32.5 15.5 29.9 21 15.7 ...
## $ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ DAILY_AQI_VALUE : int 37 51 60 67 48 94 58 88 70 59 ...
## $ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS_PARAMETER_CODE : int 88101 88502 88502 88502 88502 88502 88502 88502 88502 88101 ...
## $ AQS_PARAMETER_DESC : chr "PM2.5 - Local Conditions" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" "Acceptable PM2.5 AQI & Speciation Mass" ...
## $ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
## $ STATE : chr "California" "California" "California" "California" ...
## $ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
## $ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Per-variable summary: quartiles, mean and NA counts for numeric columns,
# length and class for character columns
summary(pm_2004)
## Date Source Site ID POC
## Length:19233 Length:19233 Min. :60010007 Min. : 1.000
## Class :character Class :character 1st Qu.:60370002 1st Qu.: 1.000
## Mode :character Mode :character Median :60658001 Median : 1.000
## Mean :60588026 Mean : 1.816
## 3rd Qu.:60750006 3rd Qu.: 2.000
## Max. :61131003 Max. :12.000
##
## Daily Mean PM2.5 Concentration UNITS DAILY_AQI_VALUE
## Min. : -0.10 Length:19233 Min. : 0.00
## 1st Qu.: 6.00 Class :character 1st Qu.: 25.00
## Median : 10.10 Mode :character Median : 42.00
## Mean : 13.13 Mean : 46.32
## 3rd Qu.: 16.30 3rd Qu.: 60.00
## Max. :251.00 Max. :301.00
##
## Site Name DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE
## Length:19233 Min. :1 Min. :100 Min. :88101
## Class :character 1st Qu.:1 1st Qu.:100 1st Qu.:88101
## Mode :character Median :1 Median :100 Median :88101
## Mean :1 Mean :100 Mean :88267
## 3rd Qu.:1 3rd Qu.:100 3rd Qu.:88502
## Max. :1 Max. :100 Max. :88502
##
## AQS_PARAMETER_DESC CBSA_CODE CBSA_NAME STATE_CODE
## Length:19233 Min. :12540 Length:19233 Min. :6
## Class :character 1st Qu.:31080 Class :character 1st Qu.:6
## Mode :character Median :40140 Mode :character Median :6
## Mean :35328 Mean :6
## 3rd Qu.:41860 3rd Qu.:6
## Max. :49700 Max. :6
## NA's :1253
## STATE COUNTY_CODE COUNTY SITE_LATITUDE
## Length:19233 Min. : 1.00 Length:19233 Min. :32.63
## Class :character 1st Qu.: 37.00 Class :character 1st Qu.:34.07
## Mode :character Median : 65.00 Mode :character Median :36.48
## Mean : 58.63 Mean :36.23
## 3rd Qu.: 75.00 3rd Qu.:38.10
## Max. :113.00 Max. :41.71
##
## SITE_LONGITUDE
## Min. :-124.2
## 1st Qu.:-121.6
## Median :-119.3
## Mean :-119.7
## 3rd Qu.:-117.9
## Max. :-115.5
##
The pm_2004 data table has 19233 rows and 20 columns (variables). Of these variables, 8 are of type “chr”, 8 are of type “int”, and 4 are of type “num”. The mean daily PM2.5 concentration in 2004 is 13.13 ug/m3, and the mean daily air quality index (AQI) value is 46.32.
# ---- Inspect the 2019 data ----
# Dimensions: number of rows and number of columns
dim(pm_2019)
## [1] 53156 20
# Preview the first six rows to confirm the file was read correctly
head(pm_2019)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 01/01/2019 AQS 60010007 3 5.7 ug/m3 LC
## 2: 01/02/2019 AQS 60010007 3 11.9 ug/m3 LC
## 3: 01/03/2019 AQS 60010007 3 20.1 ug/m3 LC
## 4: 01/04/2019 AQS 60010007 3 28.8 ug/m3 LC
## 5: 01/05/2019 AQS 60010007 3 11.2 ug/m3 LC
## 6: 01/06/2019 AQS 60010007 3 2.7 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 24 Livermore 1 100
## 2: 50 Livermore 1 100
## 3: 68 Livermore 1 100
## 4: 86 Livermore 1 100
## 5: 47 Livermore 1 100
## 6: 11 Livermore 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 41860
## 2: 88101 PM2.5 - Local Conditions 41860
## 3: 88101 PM2.5 - Local Conditions 41860
## 4: 88101 PM2.5 - Local Conditions 41860
## 5: 88101 PM2.5 - Local Conditions 41860
## 6: 88101 PM2.5 - Local Conditions 41860
## CBSA_NAME STATE_CODE STATE COUNTY_CODE COUNTY
## 1: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 2: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 3: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 4: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 5: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## 6: San Francisco-Oakland-Hayward, CA 6 California 1 Alameda
## SITE_LATITUDE SITE_LONGITUDE
## 1: 37.68753 -121.7842
## 2: 37.68753 -121.7842
## 3: 37.68753 -121.7842
## 4: 37.68753 -121.7842
## 5: 37.68753 -121.7842
## 6: 37.68753 -121.7842
# Preview the last six rows to confirm the end of the file was read correctly
tail(pm_2019)
## Date Source Site ID POC Daily Mean PM2.5 Concentration UNITS
## 1: 11/11/2019 AQS 61131003 1 13.5 ug/m3 LC
## 2: 11/17/2019 AQS 61131003 1 18.1 ug/m3 LC
## 3: 11/29/2019 AQS 61131003 1 12.5 ug/m3 LC
## 4: 12/17/2019 AQS 61131003 1 23.8 ug/m3 LC
## 5: 12/23/2019 AQS 61131003 1 1.0 ug/m3 LC
## 6: 12/29/2019 AQS 61131003 1 9.1 ug/m3 LC
## DAILY_AQI_VALUE Site Name DAILY_OBS_COUNT PERCENT_COMPLETE
## 1: 54 Woodland-Gibson Road 1 100
## 2: 64 Woodland-Gibson Road 1 100
## 3: 52 Woodland-Gibson Road 1 100
## 4: 76 Woodland-Gibson Road 1 100
## 5: 4 Woodland-Gibson Road 1 100
## 6: 38 Woodland-Gibson Road 1 100
## AQS_PARAMETER_CODE AQS_PARAMETER_DESC CBSA_CODE
## 1: 88101 PM2.5 - Local Conditions 40900
## 2: 88101 PM2.5 - Local Conditions 40900
## 3: 88101 PM2.5 - Local Conditions 40900
## 4: 88101 PM2.5 - Local Conditions 40900
## 5: 88101 PM2.5 - Local Conditions 40900
## 6: 88101 PM2.5 - Local Conditions 40900
## CBSA_NAME STATE_CODE STATE COUNTY_CODE
## 1: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 2: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 3: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 4: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 5: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## 6: Sacramento--Roseville--Arden-Arcade, CA 6 California 113
## COUNTY SITE_LATITUDE SITE_LONGITUDE
## 1: Yolo 38.66121 -121.7327
## 2: Yolo 38.66121 -121.7327
## 3: Yolo 38.66121 -121.7327
## 4: Yolo 38.66121 -121.7327
## 5: Yolo 38.66121 -121.7327
## 6: Yolo 38.66121 -121.7327
# List the variable (column) names
colnames(pm_2019)
## [1] "Date" "Source"
## [3] "Site ID" "POC"
## [5] "Daily Mean PM2.5 Concentration" "UNITS"
## [7] "DAILY_AQI_VALUE" "Site Name"
## [9] "DAILY_OBS_COUNT" "PERCENT_COMPLETE"
## [11] "AQS_PARAMETER_CODE" "AQS_PARAMETER_DESC"
## [13] "CBSA_CODE" "CBSA_NAME"
## [15] "STATE_CODE" "STATE"
## [17] "COUNTY_CODE" "COUNTY"
## [19] "SITE_LATITUDE" "SITE_LONGITUDE"
# Show the storage type of every variable along with its first few values
str(pm_2019)
## Classes 'data.table' and 'data.frame': 53156 obs. of 20 variables:
## $ Date : chr "01/01/2019" "01/02/2019" "01/03/2019" "01/04/2019" ...
## $ Source : chr "AQS" "AQS" "AQS" "AQS" ...
## $ Site ID : int 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 60010007 ...
## $ POC : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Daily Mean PM2.5 Concentration: num 5.7 11.9 20.1 28.8 11.2 2.7 2.8 7 3.1 7.1 ...
## $ UNITS : chr "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" "ug/m3 LC" ...
## $ DAILY_AQI_VALUE : int 24 50 68 86 47 11 12 29 13 30 ...
## $ Site Name : chr "Livermore" "Livermore" "Livermore" "Livermore" ...
## $ DAILY_OBS_COUNT : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PERCENT_COMPLETE : num 100 100 100 100 100 100 100 100 100 100 ...
## $ AQS_PARAMETER_CODE : int 88101 88101 88101 88101 88101 88101 88101 88101 88101 88101 ...
## $ AQS_PARAMETER_DESC : chr "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" "PM2.5 - Local Conditions" ...
## $ CBSA_CODE : int 41860 41860 41860 41860 41860 41860 41860 41860 41860 41860 ...
## $ CBSA_NAME : chr "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" "San Francisco-Oakland-Hayward, CA" ...
## $ STATE_CODE : int 6 6 6 6 6 6 6 6 6 6 ...
## $ STATE : chr "California" "California" "California" "California" ...
## $ COUNTY_CODE : int 1 1 1 1 1 1 1 1 1 1 ...
## $ COUNTY : chr "Alameda" "Alameda" "Alameda" "Alameda" ...
## $ SITE_LATITUDE : num 37.7 37.7 37.7 37.7 37.7 ...
## $ SITE_LONGITUDE : num -122 -122 -122 -122 -122 ...
## - attr(*, ".internal.selfref")=<externalptr>
# Per-variable summary: quartiles, mean and NA counts for numeric columns,
# length and class for character columns
summary(pm_2019)
## Date Source Site ID POC
## Length:53156 Length:53156 Min. :60010007 Min. : 1.000
## Class :character Class :character 1st Qu.:60310004 1st Qu.: 1.000
## Mode :character Mode :character Median :60612003 Median : 3.000
## Mean :60565264 Mean : 2.573
## 3rd Qu.:60771002 3rd Qu.: 3.000
## Max. :61131003 Max. :21.000
##
## Daily Mean PM2.5 Concentration UNITS DAILY_AQI_VALUE
## Min. : -2.200 Length:53156 Min. : 0.00
## 1st Qu.: 4.000 Class :character 1st Qu.: 17.00
## Median : 6.500 Mode :character Median : 27.00
## Mean : 7.738 Mean : 30.57
## 3rd Qu.: 9.900 3rd Qu.: 41.00
## Max. :120.900 Max. :185.00
##
## Site Name DAILY_OBS_COUNT PERCENT_COMPLETE AQS_PARAMETER_CODE
## Length:53156 Min. :1 Min. :100 Min. :88101
## Class :character 1st Qu.:1 1st Qu.:100 1st Qu.:88101
## Mode :character Median :1 Median :100 Median :88101
## Mean :1 Mean :100 Mean :88214
## 3rd Qu.:1 3rd Qu.:100 3rd Qu.:88502
## Max. :1 Max. :100 Max. :88502
##
## AQS_PARAMETER_DESC CBSA_CODE CBSA_NAME STATE_CODE
## Length:53156 Min. :12540 Length:53156 Min. :6
## Class :character 1st Qu.:31080 Class :character 1st Qu.:6
## Mode :character Median :40140 Mode :character Median :6
## Mean :35839 Mean :6
## 3rd Qu.:41860 3rd Qu.:6
## Max. :49700 Max. :6
## NA's :4181
## STATE COUNTY_CODE COUNTY SITE_LATITUDE
## Length:53156 Min. : 1.00 Length:53156 Min. :32.58
## Class :character 1st Qu.: 31.00 Class :character 1st Qu.:34.14
## Mode :character Median : 61.00 Mode :character Median :36.63
## Mean : 56.38 Mean :36.34
## 3rd Qu.: 77.00 3rd Qu.:37.97
## Max. :113.00 Max. :41.76
##
## SITE_LONGITUDE
## Min. :-124.2
## 1st Qu.:-121.6
## Median :-119.8
## Mean :-119.8
## 3rd Qu.:-118.1
## Max. :-115.5
##
The pm_2019 data table has 53156 rows and 20 columns (variables). Of these variables, 8 are of type “chr”, 8 are of type “int”, and 4 are of type “num”. The mean daily PM2.5 concentration in 2019 is 7.738 ug/m3, and the mean daily air quality index (AQI) value is 30.57.
Question 2:
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, first, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Stack the 2004 and 2019 tables into a single table
pm <- rbind(pm_2004, pm_2019)
# Tag every row with its source year (2004 rows come first, then 2019 rows)
# and shorten the coordinate and concentration column names
pm <- pm %>%
  mutate(
    year = factor(rep(c(2004, 2019), c(nrow(pm_2004), nrow(pm_2019))))
  ) %>%
  rename(
    lat = SITE_LATITUDE,
    lon = SITE_LONGITUDE,
    pm2.5 = "Daily Mean PM2.5 Concentration"
  )
Question 3:
library(leaflet)
# Keep only the site coordinates and the year identifier for mapping
pm_new <- pm %>%
  select(lat, lon, year)
# year is a factor in pm; convert it back to the numeric year values
pm_new$year <- as.numeric(as.character(pm_new$year))
# Year is categorical here (2004 vs 2019), so use a discrete palette: a
# numeric palette would draw a continuous legend containing years that are
# not in the data. The two colours are the endpoints of the earlier ramp.
pm.pal <- colorFactor(c("darkgreen", "brown"), domain = pm_new$year)
# Map the monitoring-site locations, coloured by year
pmmap <- leaflet(pm_new) %>%
  # Light basemap tiles
  addProviderTiles("CartoDB.Positron") %>%
  # One circle per row of pm_new, placed at its site coordinates
  addCircles(
    lat = ~lat, lng = ~lon,
    # Hover label shows the year only (the old " C" suffix was a stray unit)
    label = ~as.character(year), color = ~pm.pal(year),
    opacity = 1, fillOpacity = 1, radius = 500
  ) %>%
  # Legend keyed to the same palette as the circles
  addLegend("bottomleft", pal = pm.pal, values = pm_new$year,
            title = "Year", opacity = 1)
pmmap
Question 4:
# Check for any missing values of PM in the combined dataset
which(is.na(pm$pm2.5))
## integer(0)
# Check for NaN values of PM in the combined dataset
which(is.nan(pm$pm2.5))
## integer(0)
# Check for implausible values: a concentration cannot be negative, yet the
# yearly summaries report minima below zero (-0.10 in 2004, -2.2 in 2019),
# which is.nan() does not flag. Count the negative readings in each year.
table(pm$year[pm$pm2.5 < 0])
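There are no missing (NA) or NaN values of PM2.5 in the combined dataset. However, the summaries above show negative minimum concentrations (-0.10 in 2004 and -2.2 in 2019), which are physically implausible and are not detected by is.nan().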
Question 5:
# Build a table holding the three spatial levels (state, county, site) and
# add a combined "state.county" label in a new location column
pm_spatial <- pm %>%
  select(STATE, COUNTY, `Site Name`) %>%
  mutate(location = paste(STATE, COUNTY, sep = "."))
# create the boxplot
library(ggplot2)